In [ ]:
%pylab inline
# %pylab provides the numpy and plotting names (rcParams, arange, bar, plot, savefig, plt, ...) used below
from __future__ import division
import codecs
import pickle
import networkx as nx
from collections import Counter

rcParams['figure.figsize'] = (12.0, 10.0)
rcParams['font.family'] = 'Times New Roman'

In [ ]:
from os.path import abspath
workspace = "/".join(abspath('.').split('/')[:-1])

Note: Make sure that workspace points to the root directory of openie_eval.
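
A quick sanity check may help here; this is a hypothetical snippet that only assumes the data directory used throughout this notebook lives under the repository root:

In [ ]:
import os
# Fail early if workspace does not point at the repository root (all data paths below depend on it).
assert os.path.isdir(os.path.join(workspace, 'data')), workspace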


In [ ]:
from openie_eval.openie_eval import semantic_parsing as sp
from openie_eval.openie_eval import ontologization
reload(sp)
reload(ontologization)

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [ ]:
keyword = 'carnatic_music'

wiki_entities = codecs.open(workspace + '/data/ground-truth/'+keyword+'_pages.txt', encoding='utf-8').readlines()
wiki_entities = [i.strip().lower() for i in wiki_entities]

methods = ['reverb', 'openie', 'semantic-parsing']
labels = {'reverb': 'ReVerb', 'openie': 'OpenIE 4.0', 'semantic-parsing': 'Sem. Parsing'}
colors = ['#990033', '#006600', '#330066']

#coref_suffix = ''
coref_suffix = '-coref'

#filtered_suffix = ''
filtered_suffix = '-filtered'

Entity identification


In [ ]:
print len(wiki_entities)
for method in methods:
    relations = pickle.load(file(workspace + '/data/'+method+'/'+keyword+'/relations'+coref_suffix+filtered_suffix+'.pickle'))
    relations = [[i['arg1'].lower(), lemmatizer.lemmatize(i['rel'].lower(), pos='v'), i['arg2'].lower()] for i in relations]
    candidate_entities = [i[0] for i in relations]
    overlap = set(candidate_entities).intersection(wiki_entities)
    residual = set(candidate_entities)-set(wiki_entities)
    # Columns: method, |overlap|, |residual|, overlap ratio vs. Wikipedia entities, residual ratio vs. candidates
    print method, len(overlap), len(residual), round(len(overlap)/len(wiki_entities), 2), round(len(residual)/len(set(candidate_entities)), 2)

Rule-based

Create rules


In [ ]:
#carnatic
class_terms = {}
class_terms['carnatic_ragas'] = ['raga', 'raaga', 'scale']
class_terms['carnatic_singers'] = ['vocalist', 'singer']
class_terms['carnatic_composers'] = ['composer', 'poet']
class_terms['carnatic_instrumentalists'] = ['instrumentalist', 'player', 'violinist']
class_terms['carnatic_compositions'] = ['composition', 'song']
class_terms['carnatic_musicians'] = list(concatenate([class_terms[i] for i in ['carnatic_singers', 'carnatic_composers', 'carnatic_instrumentalists']]))
class_terms['carnatic_musicians'].append('artist')

out_file = workspace + '/data/results/qualitative/entity-identification/rule-based/carnatic_music/rules.pickle'
pickle.dump(class_terms, file(out_file, 'w'))

In [ ]:
#hindustani
class_terms = {}
class_terms['hindustani_ragas'] = ['raga', 'raaga', 'raag', 'rag', 'scale', u'rāga']
class_terms['hindustani_singers'] = ['vocalist', 'singer']
class_terms['hindustani_composers'] = ['composer', 'poet']
class_terms['hindustani_instrumentalists'] = ['instrumentalist', 'player', 'violinist']
#class_terms['carnatic_compositions'] = ['composition', 'song']
class_terms['hindustani_musicians'] = list(concatenate([class_terms[i] for i in ['hindustani_singers', 'hindustani_composers', 'hindustani_instrumentalists']]))
class_terms['hindustani_musicians'].append('artist')

out_file = workspace + '/data/results/qualitative/entity-identification/rule-based/hindustani_music/rules.pickle'
pickle.dump(class_terms, file(out_file, 'w'))

Class assignment


In [ ]:
keyword = 'hindustani_music'

coverage = {}
labelled_class_instances = {}

rules = pickle.load(file(workspace + '/data/results/qualitative/entity-identification/rule-based/'+keyword+'/rules.pickle'))
groundtruth = ontologization.load_groundtruth(keyword, rules.keys())
class_terms = rules  # same mapping as rules; reused below for plot labels and tick names

for method in methods:
    relations = pickle.load(file(workspace + '/data/'+method+'/'+keyword+'/relations'+coref_suffix+filtered_suffix+'.pickle'))
    relations = [[i['arg1'].lower(), lemmatizer.lemmatize(i['rel'].lower(), pos='v'), i['arg2'].lower()] for i in relations]
    
    class_instances = ontologization.class_instances_by_rules(relations, rules)
    res = ontologization.analyze_coverage(class_instances, groundtruth)
    coverage[method] = res['coverage']
    labelled_class_instances[method] = res['labelled_class_instances']

In [ ]:
def label_numbers(rects, numbers):
    # Attach count labels to the bars; relies on the module-level ax from the plotting cells.
    for rect, number in zip(rects, numbers):
        text_label = str(number)
        if text_label == '0':
            continue
        height = rect.get_height()
        ax.text(rect.get_x()+rect.get_width()/2., height-0.03, '%s'%(text_label),
                fontsize=22, ha='center', va='bottom', color='w')

In [ ]:
rcParams['figure.figsize'] = (12.0, 10.0)

fig, ax = plt.subplots()

bar_width = 0.2
index = arange(len(class_terms))

count = 0
all_fp_ratios = []

for method in methods:
    overlap_scores = [i[0] for i in coverage[method]]
    rects = bar(index, overlap_scores, width=bar_width, color=colors[count], label=labels[method])
    label_numbers(rects, [len(labelled_class_instances[method][i]['tp']) for i in class_terms.keys()])
    fp_ratios = [i[1] for i in coverage[method]]
    all_fp_ratios.extend(zip(index+bar_width/2.0, fp_ratios))
    index = index+bar_width
    count += 1
    
all_fp_ratios = array(sorted(all_fp_ratios, key=lambda x:x[0]))
stem(all_fp_ratios[:, 0], all_fp_ratios[:, 1], linefmt='k--', markerfmt='ko')

fontsize=30
xlabel('Concepts', fontsize=fontsize+2)
ylabel('Overlap ($O$) with reference data', fontsize=fontsize+2)
if keyword == 'carnatic_music':
    xticks(index-1.5*bar_width, [i[9:] for i in class_terms.keys()])
else:
    xticks(index-1.5*bar_width, [i[11:] for i in class_terms.keys()])
legend(prop={'size': fontsize}, loc='upper left', 
       fancybox=True)

xticks(fontsize=fontsize, rotation=14)
yticks(fontsize=fontsize)

In [ ]:
ylim(0, 0.74)

In [ ]:
fname = workspace + '/data/results/qualitative/entity-identification/rule-based/'+keyword+'/class-agreement-with-wikipedia'
savefig(fname+'.pdf', dpi=200, facecolor='w', edgecolor='w', orientation='landscape', 
        papertype=None, format=None, transparent=False, bbox_inches='tight', pad_inches=0.1)
savefig(fname+'.png', dpi=200, facecolor='w', edgecolor='w', orientation='landscape', 
        papertype=None, format=None, transparent=False, bbox_inches='tight', pad_inches=0.1)

In [ ]:
close('all')

In [ ]:
agreement_scores = ontologization.compute_agreement(labelled_class_instances, methods)

In [ ]:
rcParams['figure.figsize'] = (12.0, 10.0)
inter_labels = {'reverb-openie': 'ReVerb-OpenIE 4.0', 'openie-semantic-parsing': 'OpenIE 4.0-Sem. Parsing', 
                'reverb-semantic-parsing': 'ReVerb-Sem. Parsing'}

fig, ax = plt.subplots()

bar_width = 0.2
index = arange(len(class_terms))
count = 0
for method, res in agreement_scores.items():
    scores = [i[0] for i in res]
    abs_numbers = [len(i[1]) for i in res]
    
    rects = bar(index, scores, bar_width, color=colors[count], label=inter_labels[method])
    label_numbers(rects, abs_numbers)
    
    index = index+bar_width
    count += 1

fontsize=30
xlabel('Concepts', fontsize=fontsize+2)
ylabel('Inter-system agreement over $R$', fontsize=fontsize+2)
if keyword == 'carnatic_music':
    xticks(index-1.5*bar_width, [i[9:] for i in class_terms.keys()])
else:
    xticks(index-1.5*bar_width, [i[11:] for i in class_terms.keys()])
legend(prop={'size': fontsize}, loc='upper center', 
       bbox_to_anchor=(0.5, 1.2), fancybox=True)

xticks(fontsize=fontsize, rotation=10)
yticks(fontsize=fontsize)

In [ ]:
ylim(0, 1.05)

In [ ]:
fname = workspace + '/data/results/qualitative/entity-identification/rule-based/'+keyword+'/class-agreement-inter-method'
savefig(fname+'.pdf', dpi=200, facecolor='w', edgecolor='w', orientation='landscape', 
        papertype=None, format=None, transparent=False, bbox_inches='tight', pad_inches=0.1)
savefig(fname+'.png', dpi=200, facecolor='w', edgecolor='w', orientation='landscape', 
        papertype=None, format=None, transparent=False, bbox_inches='tight', pad_inches=0.1)

Bootstrapping

Distance measure: cosine similarity between the split-object vector of the seed set and that of a given entity. For the seed set, we only consider split objects that occur more than once.

  • Iterate over the candidate set and select the entity nearest to the seed set
  • Merge it with the seed set and recompute the seed set's split-object vector
  • Repeat (a minimal sketch of this loop follows the lists below)

Variables to play with:

  • Seed set size
  • Number of iterations
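
The actual selection is done by ontologization.bootstrap_lsa below; the following is only a minimal, self-contained sketch of the loop described above, assuming bag-of-words split-object vectors and plain cosine similarity (all names in it are illustrative):

In [ ]:
from collections import Counter
from math import sqrt

def cosine(a, b):
    # Cosine similarity between two sparse count vectors (Counters).
    num = sum(a[k]*b.get(k, 0) for k in a)
    den = sqrt(sum(v*v for v in a.values())) * sqrt(sum(v*v for v in b.values()))
    return num/den if den else 0.0

def bootstrap_sketch(seed_entities, entity_vectors, iterations):
    # entity_vectors: {entity: Counter of the split objects it co-occurs with}
    seeds = list(seed_entities)
    seed_vector = Counter()
    for e in seeds:
        seed_vector.update(entity_vectors.get(e, {}))
    # Keep only the split objects that occur more than once in the seed set.
    seed_vector = Counter({k: v for k, v in seed_vector.items() if v > 1})
    for _ in range(iterations):
        candidates = [e for e in entity_vectors if e not in seeds]
        if not candidates:
            break
        # Select the candidate nearest to the current seed set ...
        nearest = max(candidates, key=lambda e: cosine(entity_vectors[e], seed_vector))
        # ... merge it and recompute the seed set's split-object vector.
        seeds.append(nearest)
        seed_vector.update(entity_vectors[nearest])
    return seeds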

In [ ]:
from random import shuffle
reload(ontologization)

#NOTE: Run the rule-based section above first; this part reuses class_instances and groundtruth from it.
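
# Hypothetical guard (assumes the rule-based cells above were executed in this session):
assert 'class_instances' in globals() and 'groundtruth' in globals(), 'Run the rule-based section first'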

In [ ]:
def get_seedset(class_instances, n=3):
    seedset = {}
    for class_type, instances in class_instances.items():
        shuffle(instances)
        seedset[class_type] = instances[:n]
    return seedset

In [ ]:
coverage = {}
iteration_step = 5

for method in methods:
    coverage[method] = {}
    
    relations = pickle.load(file(workspace + '/data/'+method+'/'+keyword+'/relations'+coref_suffix+filtered_suffix+'.pickle'))
    relations = [[i['arg1'].lower(), lemmatizer.lemmatize(i['rel'].lower(), pos='v'), i['arg2'].lower()] for i in relations]
        
    predicates = ontologization.get_predicates(relations, normalization=False)
    objects = ontologization.get_objects(relations, split=True, normalization=True)
    class_instances = ontologization.class_instances_by_rules(relations, rules)
    
    n_seedsets = 5
    for n_seedset in xrange(n_seedsets): 
        seedset = get_seedset(class_instances, 3)
        
        for class_type in seedset.keys():
            if class_type not in coverage[method].keys():
                coverage[method][class_type] = []
                
            bootstrap_iterator = ontologization.bootstrap_lsa(seedset[class_type], objects, predicates, 
                                               expansion=1, iterations=len(groundtruth[class_type]), yield_step=iteration_step)
            iter_count = 1
            while True:
                try:
                    res = bootstrap_iterator.next()
                    overlap_score = ontologization.overlap(res, groundtruth[class_type])
                    fp_ratio = len(set(res)-set(groundtruth[class_type]))/len(res)
                    # First seed set run appends values; subsequent runs average into the stored ones.
                    if len(coverage[method][class_type]) < iter_count:
                        coverage[method][class_type].append([overlap_score, fp_ratio])
                    else:
                        coverage[method][class_type][iter_count-1][0] += overlap_score
                        coverage[method][class_type][iter_count-1][0] /= 2.0
                        coverage[method][class_type][iter_count-1][1] += fp_ratio
                        coverage[method][class_type][iter_count-1][1] /= 2.0
                    iter_count += 1
                except StopIteration:
                    break

In [ ]:
coverage

In [ ]:
import itertools
def flip(items, ncol):
    return itertools.chain(*[items[i::ncol] for i in range(ncol)])

In [ ]:
rcParams['figure.figsize'] = (12.0, 10.0)
styles = ['-', '--']
for class_type in seedset.keys():
    fig = figure()
    ax = fig.add_subplot(1,1,1)
    
    count = 0
    for method in methods:
        y1 = [i[0] for i in coverage[method][class_type]]
        y2 = [i[1] for i in coverage[method][class_type]]
        x = arange(1, len(y1)+1)*iteration_step
        plot(x, y1, styles[0], color=colors[count], label=labels[method], linewidth=2.5)
        plot(x, y2, styles[1], color=colors[count], linewidth=2.5)
        count += 1
    
    fontsize=30
    xlabel('No. of entities bootstrapped', fontsize=fontsize+2)
    #ylabel('Num. of instances bootstrapped', fontsize=fontsize+2)

    #Get artists and labels for legend
    handles, _labels = ax.get_legend_handles_labels()
    
    #Create custom artists
    custom_artists = []
    custom_artists.append(plt.Line2D((0,1),(0,0), color='k', linestyle='-'))
    custom_artists.append(plt.Line2D((0,1),(0,0), color='k', linestyle='--'))
    
    ax.legend(flip(handles+custom_artists, 3),
              flip(_labels + ['Overlap ($O$)', 'Residual ($R$)'], 3),
              ncol=3, prop={'size': fontsize-6},
              loc='upper center', bbox_to_anchor=(0.5, 1.1),
              fancybox=True)
    
    xticks(fontsize=fontsize)
    yticks(fontsize=fontsize)
    ylim_down, ylim_up = ax.get_ylim()
    ylim(ylim_down, ylim_up*1.05)
    xlim_down, xlim_up = ax.get_xlim()
    xlim(xlim_down, xlim_up*0.95)
    
    grid(True)
    xgridlines = getp(gca(), 'xgridlines')
    ygridlines = getp(gca(), 'ygridlines')
    setp(xgridlines, 'color', '0.6')
    setp(ygridlines, 'color', '0.6')
    
    fname = workspace + '/data/results/qualitative/entity-identification/bootstrapping/'+keyword+'/'+class_type
    savefig(fname+'.pdf', dpi=200, facecolor='w', edgecolor='w', orientation='landscape', 
            papertype=None, format=None, transparent=False, bbox_inches='tight', pad_inches=0.1)
    savefig(fname+'.png', dpi=200, facecolor='w', edgecolor='w', orientation='landscape', 
            papertype=None, format=None, transparent=False, bbox_inches='tight', pad_inches=0.1)
    close()

In [ ]:
close('all')